{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "008794c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "#问题一附件1语音业务数据集处理代码\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from plotnine import *\n",
    "# from joypy import joyplot\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "# from minepy import MINE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "7be9b383",
   "metadata": {},
   "outputs": [],
   "source": [
    "#运行配置参数中的字体\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "#运行配置参数总的轴(axes)正常显示正负号(minus)\n",
    "plt.rcParams['axes.unicode_minus'] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c13823b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取附件1数据\n",
    "data_1 = pd.read_excel('./data/附件1语音业务用户满意度数据.xlsx')\n",
    "data_1\n",
    "#计算每一列缺失值的占比\n",
    "def missing (df):\n",
    "    missing_number = df.isnull().sum().sort_values(ascending=False)\n",
    "    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)\n",
    "    missing_values = pd.concat([missing_number,missing_percent],axis=1,keys=['缺失值数量','缺失值所占百分比'])\n",
    "    return missing_values\n",
    " \n",
    "missing(data_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a672e2f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "#删除冗余特征、无用特征\n",
    "data_1 = data_1.drop('用户id', axis = 1)\n",
    "data_1 = data_1.drop('用户描述', axis = 1)\n",
    "data_1 = data_1.drop('用户描述.1', axis = 1)\n",
    "data_1 = data_1.drop('重定向次数', axis = 1)\n",
    "data_1 = data_1.drop('重定向驻留时长', axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30ddeb5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#缺失值填补\n",
    "#是否关怀用户和是否去过营业厅，都用否来填充\n",
    "data_1['是否关怀用户'].fillna('否', inplace=True)\n",
    "data_1['是否去过营业厅'].fillna('否', inplace=True)\n",
    "# 选择用众数填充\n",
    "data_1['是否4G网络客户（本地剔除物联网）'].fillna('是', inplace = True)\n",
    "data_1['终端品牌'].value_counts()\n",
    "data_1['终端品牌'].fillna('苹果', inplace = True)\n",
    "data_1['终端品牌类型'].value_counts()\n",
    "data_1['终端品牌类型'].fillna('A2223', inplace = True)\n",
    "data_1['是否5G网络客户'].fillna('否', inplace = True)\n",
    "data_1['是否实名登记用户'].fillna('否', inplace = True)\n",
    "data_1['客户星级标识'].value_counts()\n",
    "data_1['客户星级标识'].fillna('三星', inplace = True)\n",
    "# 选择用0填充\n",
    "data_1['外省流量占比'].fillna(0.0 , inplace = True)\n",
    "data_1['当月欠费金额'].fillna(0.00, inplace = True)\n",
    "data_1['前第3个月欠费金额'].fillna(0.00, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dd1de8d",
   "metadata": {},
   "outputs": [],
   "source": [
    " #检查是否还有缺失值\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "import missingno as msn\n",
    "msn.matrix(data_1, labels = True)\n",
    "msn.bar(data_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "866443df",
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据集的填充替换\n",
    "#首先是对【'居民小区', '办公室', '高校', '商业街', '地铁', '农村', '高铁'】进行替换，-1表示“不在”，其他值（2，3，4，5，6，7）表示“在”\n",
    "cols = ['居民小区', '办公室', '高校', '商业街', '地铁', '农村', '高铁']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_1[col].replace(-1, '不在', inplace = True)\n",
    "        data_1[col].replace(j, '在', inplace = True)\n",
    "data_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49d11cf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#检查列名\n",
    "data_1.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73d3c28c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“其他，请注明”和“其他，请注明.1”也需要进行替换，98表示“有”，-1表示没有\n",
    "data_1['其他，请注明.1'].replace(98, '有', inplace = True)\n",
    "data_1['其他，请注明.1'].replace(-1, '没有', inplace = True)\n",
    "data_1['其他，请注明'].replace(98, '有', inplace = True)\n",
    "data_1['其他，请注明'].replace(-1, '没有', inplace = True)\n",
    "#“是否遇到网络问题”中，2表示否，1表示“是”\n",
    "data_1['是否遇到过网络问题'].replace(2, '否', inplace = True)\n",
    "data_1['是否遇到过网络问题'].replace(1, '是', inplace = True)\n",
    "#['手机没有信号', '有信号无法拨通', '通话过程中突然中断', '通话中有杂音、听不清、断断续续', '串线', '通话过程中一方听不见']中，-1表示没有，1，2，3，4，5，6表示“有”\n",
    "cols = ['手机没有信号', '有信号无法拨通', '通话过程中突然中断', '通话中有杂音、听不清、断断续续', '串线', '通话过程中一方听不见']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_1[col].replace(-1, '没有', inplace = True)\n",
    "        data_1[col].replace(j, '有', inplace = True)\n",
    "data_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a14fdba3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#删除“终端品牌类型”这个无关特征\n",
    "data_1 = data_1.drop('终端品牌类型', axis = 1)\n",
    "data_1.to_excel('./data_1处理后的数据.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "912d793f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#根据“语音通话整体满意度”进行编码\n",
    "import category_encoders as ce\n",
    "df_1 = data_1.copy()\n",
    "target = df_1[['语音通话整体满意度']]\n",
    "train = df_1.drop(['语音通话整体满意度', '网络覆盖与信号强度', '语音通话清晰度', '语音通话稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data = pd.DataFrame(cbe_encoder.transform(train))\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fc6ea50",
   "metadata": {},
   "outputs": [],
   "source": [
    "data['客户星级标识'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03cd8ecc",
   "metadata": {},
   "outputs": [],
   "source": [
    "#随机森林进行特征重要性提取\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "X, y = data, df_1.iloc[:, 0]\n",
    "feat_labels = df_1.columns[4 : ]\n",
    "forest = RandomForestClassifier(n_estimators = 300 ,random_state = 10)\n",
    "forest.fit(X, y)\n",
    "importances = pd.Series(forest.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "importances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35be95fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "#语音通话整体满意度\n",
    "#互信息\n",
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    "index_4 = np.array(data.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data[i], df_1['语音通话整体满意度'])[0]\n",
    "    lst.append(l)\n",
    "w1 = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bdef5b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(w1.index)\n",
    "MIC = pd.DataFrame()\n",
    "MIC['变量'] = index\n",
    "MIC['互信息值'] = w1.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af766646",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = MIC.iloc[: 20, :], x = '互信息值', y = '变量', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('语音通话整体满意度的互信息值', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('变量', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('互信息值', font={'family':'STSong', 'size':14})\n",
    "df = MIC.iloc[: 20, :]\n",
    "C = np.array(df['互信息值'])\n",
    "for a, b, c in zip(df['互信息值'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bd0d89c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#使用GBDT算法提取特征重要性\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "clf = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf.fit(X, y)\n",
    "var_importance = pd.Series(clf.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04f605be",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(var_importance.index)\n",
    "feature_importance = pd.DataFrame()\n",
    "feature_importance['feature_names'] = index\n",
    "feature_importance['feature_values'] = var_importance.values\n",
    "feature_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29f2fe81",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('语音通话整体满意度', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13916643",
   "metadata": {},
   "outputs": [],
   "source": [
    "#网络覆盖与信号强度\n",
    "#互信息\n",
    "df_2 = data_1.copy()\n",
    "target = df_2[['网络覆盖与信号强度']]\n",
    "train = df_1.drop(['语音通话整体满意度', '网络覆盖与信号强度', '语音通话清晰度', '语音通话稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data2 = pd.DataFrame(cbe_encoder.transform(train))\n",
    "data2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9b0b024",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    " \n",
    "index_4 = np.array(data2.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data2[i], df_2['网络覆盖与信号强度'])[0]\n",
    "    lst.append(l)\n",
    "w2 = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e5ad2f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(w2.index)\n",
    "MIC2 = pd.DataFrame()\n",
    "MIC2['变量'] = index\n",
    "MIC2['互信息值'] = w2.values\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = MIC2.iloc[: 20, :], x = '互信息值', y = '变量', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('网络覆盖与信号强度的互信息值', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('变量', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('互信息值', font={'family':'STSong', 'size':14})\n",
    "df = MIC2.iloc[: 20, :]\n",
    "C = np.array(df['互信息值'])\n",
    "for a, b, c in zip(df['互信息值'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44970b2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#使用GBDT算法提取特征重要性\n",
    "X, y = data2, df_2.iloc[:, 1]\n",
    "feat_labels = df_2.columns[4 : ]\n",
    "clf2 = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf2.fit(X, y)\n",
    "var_importance2 = pd.Series(clf2.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5558f3be",
   "metadata": {},
   "outputs": [],
   "source": [
    "index2 = np.array(var_importance2.index)\n",
    "feature_importance2 = pd.DataFrame()\n",
    "feature_importance2['feature_names'] = index2\n",
    "feature_importance2['feature_values'] = var_importance2.values\n",
    "feature_importance2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f497c72",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance2.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('网络覆盖与信号强度', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance2.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cf38198",
   "metadata": {},
   "outputs": [],
   "source": [
    "#语音通话清晰度\n",
    "#互信息\n",
    "df_3 = data_1.copy()\n",
    "target = df_3[['语音通话清晰度']]\n",
    "train = df_1.drop(['语音通话整体满意度', '网络覆盖与信号强度', '语音通话清晰度', '语音通话稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data3 = pd.DataFrame(cbe_encoder.transform(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2270f7d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    "index_4 = np.array(data3.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data3[i], df_3['语音通话清晰度'])[0]\n",
    "    lst.append(l)\n",
    "w3 = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8964d2d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(w3.index)\n",
    "MIC3 = pd.DataFrame()\n",
    "MIC3['变量'] = index\n",
    "MIC3['互信息值'] = w3.values\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = MIC3.iloc[: 20, :], x = '互信息值', y = '变量', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('语音通话清晰度的互信息值', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('变量', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('互信息值', font={'family':'STSong', 'size':14})\n",
    "df = MIC3.iloc[: 20, :]\n",
    "C = np.array(df['互信息值'])\n",
    "for a, b, c in zip(df['互信息值'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c89bc37",
   "metadata": {},
   "outputs": [],
   "source": [
    "#使用GBDT算法提取特征重要性\n",
    "X, y = data3, df_3.iloc[:, 2]\n",
    "feat_labels = df_3.columns[4 : ]\n",
    "clf3 = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf3.fit(X, y)\n",
    "var_importance3 = pd.Series(clf3.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43d3095f",
   "metadata": {},
   "outputs": [],
   "source": [
    "index3 = np.array(var_importance3.index)\n",
    "feature_importance3 = pd.DataFrame()\n",
    "feature_importance3['feature_names'] = index3\n",
    "feature_importance3['feature_values'] = var_importance3.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2007695f",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance3.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('语音通话清晰度', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance3.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08b2d86e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#语音通话稳定性\n",
    "#互信息\n",
    "df_4 = data_1.copy()\n",
    "target = df_4[['语音通话稳定性']]\n",
    "train = df_1.drop(['语音通话整体满意度', '网络覆盖与信号强度', '语音通话清晰度', '语音通话稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data4 = pd.DataFrame(cbe_encoder.transform(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ee0e69a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    "index_4 = np.array(data4.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data4[i], df_4['语音通话稳定性'])[0]\n",
    "    lst.append(l)\n",
    "w4 = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a93b62d",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(w4.index)\n",
    "MIC4 = pd.DataFrame()\n",
    "MIC4['变量'] = index\n",
    "MIC4['互信息值'] = w4.values\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = MIC4.iloc[: 20, :], x = '互信息值', y = '变量', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('语音通话清晰度的互信息值', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('变量', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('互信息值', font={'family':'STSong', 'size':14})\n",
    "df = MIC4.iloc[: 20, :]\n",
    "C = np.array(df['互信息值'])\n",
    "for a, b, c in zip(df['互信息值'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7759bd24",
   "metadata": {},
   "outputs": [],
   "source": [
    "#使用GBDT算法提取特征重要性\n",
    "X, y = data4, df_4.iloc[:, 2]\n",
    "feat_labels = df_4.columns[4 : ]\n",
    "clf4 = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf4.fit(X, y)\n",
    "var_importance4 = pd.Series(clf4.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20dd0e63",
   "metadata": {},
   "outputs": [],
   "source": [
    "index4 = np.array(var_importance4.index)\n",
    "feature_importance4 = pd.DataFrame()\n",
    "feature_importance4['feature_names'] = index4\n",
    "feature_importance4['feature_values'] = var_importance4.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a18d780c",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance4.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('语音通话稳定性', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance4.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7d04690",
   "metadata": {},
   "outputs": [],
   "source": [
    "#相关系数图\n",
    "data_1.iloc[:,1:5].corr()\n",
    "# Plot\n",
    "def plot_corr(y_X):\n",
    "    plt.figure(figsize=(18,3.4), dpi= 1000)\n",
    "    titles = ['pearson','kendall','spearman']\n",
    "    for i,title in enumerate(titles):\n",
    "        plt.subplot(1, 3, i+1)\n",
    "        sns.heatmap(y_X.corr(title),  # 相关系数矩阵\n",
    "                    xticklabels=y_X.corr().columns, # 横坐标标签\n",
    "                    yticklabels=y_X.corr().columns, # 横坐标标签\n",
    "                    cmap='copper', \n",
    "                    center=0, \n",
    "                    square=True,\n",
    "                    annot=True\n",
    "                    ) \n",
    "        \n",
    "        plt.title(label=title+'相关系数', fontsize=12)\n",
    "        plt.xticks(fontsize=10,\n",
    "                   rotation=45,# 旋转\n",
    "                   horizontalalignment='right') # 刻度的相对位置\n",
    "        plt.yticks(fontsize=10,rotation=0,)\n",
    "    plt.show()\n",
    " \n",
    "plot_corr(data_1.iloc[ :,1:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2d03998",
   "metadata": {},
   "outputs": [],
   "source": [
    "#热力图\n",
    "data1 = data.loc[:,index[:20]]\n",
    "plt.figure(figsize = (15, 15), dpi = 1000)\n",
    "sns.heatmap(data1.corr(), cmap='YlGnBu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83b7d9f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "#问题一量化分析图\n",
    "#是否遇到过网络问题\n",
    "plt.figure(figsize = (15, 15), dpi = 1000)\n",
    "plt.subplot(4, 2, 1)\n",
    "sns.countplot(data = data_1, x = '语音通话整体满意度', hue = '是否遇到过网络问题')\n",
    "df1 = data_1.groupby(['语音通话整体满意度', '是否遇到过网络问题']).size().reset_index(name='Count')\n",
    "no1 = [15,3,3,0,14,12,27,122,270,2107]\n",
    "yes1 = (df1.loc[df1['是否遇到过网络问题'] == '是']).loc[:,'Count']\n",
    "x1 = list(range(len(yes1)))\n",
    "for a,b in zip(x1, no1): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x1, yes1):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "plt.subplot(4, 2, 2)\n",
    "sns.countplot(data = data_1, x = '网络覆盖与信号强度', hue = '是否遇到过网络问题')\n",
    "df2 = data_1.groupby(['网络覆盖与信号强度', '是否遇到过网络问题']).size().reset_index(name='Count')\n",
    "no2 = (df2.loc[df2['是否遇到过网络问题'] == '否']).loc[:,'Count']\n",
    "yes2 = (df2.loc[df2['是否遇到过网络问题'] == '是']).loc[:,'Count']\n",
    "x2 = list(range(len(yes2)))\n",
    "for a,b in zip(x2, no2): ##控制标签位置\n",
    "    plt.text(a+0.18,b+0.1,'%.0f'%b,ha = 'center',va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x2, yes2):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b039b826",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 3)\n",
    "sns.countplot(data = data_1, x = '语音通话清晰度', hue = '是否遇到过网络问题')\n",
    "df3 = data_1.groupby(['语音通话清晰度', '是否遇到过网络问题']).size().reset_index(name='Count')\n",
    "no3 = (df3.loc[df3['是否遇到过网络问题'] == '否']).loc[:,'Count']\n",
    "yes3 = (df3.loc[df3['是否遇到过网络问题'] == '是']).loc[:,'Count']\n",
    "x3 = list(range(len(yes3)))\n",
    "for a,b in zip(x3, no3): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x3, yes3):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "    \n",
    "    \n",
    "plt.subplot(4, 2, 4)\n",
    "sns.countplot(data = data_1, x = '语音通话稳定性', hue = '是否遇到过网络问题')\n",
    "df4 = data_1.groupby(['语音通话稳定性', '是否遇到过网络问题']).size().reset_index(name='Count')\n",
    "no4 = (df4.loc[df4['是否遇到过网络问题'] == '否']).loc[:,'Count']\n",
    "yes4 = (df4.loc[df4['是否遇到过网络问题'] == '是']).loc[:,'Count']\n",
    "x4 = list(range(len(yes4)))\n",
    "for a,b in zip(x4, no4): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x4, yes4):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce4b9f3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#GPRS总流量（GB）\n",
    "max1 = (data_1.loc[data_1.groupby('语音通话整体满意度')['GPRS总流量（KB）'].idxmax(), :].reset_index())['GPRS总流量（KB）']/(1024*1024)\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df1 = pd.concat([score, max1], axis = 1)\n",
    "df1.columns = ['语音通话整体满意度', 'GPRS_max(GB)']\n",
    " \n",
    "max2 = (data_1.loc[data_1.groupby('网络覆盖与信号强度')['GPRS总流量（KB）'].idxmax(), :].reset_index())['GPRS总流量（KB）']/(1024*1024)\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df2 = pd.concat([score, max2], axis = 1)\n",
    "df2.columns = ['网络覆盖与信号强度', 'GPRS_max(GB)']\n",
    " \n",
    "max3 = (data_1.loc[data_1.groupby('语音通话清晰度')['GPRS总流量（KB）'].idxmax(), :].reset_index())['GPRS总流量（KB）']/(1024*1024)\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df3 = pd.concat([score, max3], axis = 1)\n",
    "df3.columns = ['语音通话清晰度', 'GPRS_max(GB)']\n",
    " \n",
    "max4 = ((data_1.loc[data_1.groupby('语音通话稳定性')['GPRS总流量（KB）'].idxmax(), :].reset_index())['GPRS总流量（KB）'])/(1024*1024)\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df4 = pd.concat([score, max4], axis = 1)\n",
    "df4.columns = ['语音通话稳定性', 'GPRS_max(GB)']\n",
    " \n",
    "newdf = pd.concat([df1['语音通话整体满意度'], df1['GPRS_max(GB)'], df2['GPRS_max(GB)'], df3['GPRS_max(GB)'], df4['GPRS_max(GB)']], axis=1)\n",
    "newdf.columns = ['打分', '语音通话整体满意度', '网络覆盖与信号强度', '语音通话清晰度', '语音通话稳定性']\n",
    "newdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dad7f43b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#当月ARPU\n",
    "max1 = (data_1.loc[data_1.groupby('语音通话整体满意度')['当月ARPU'].idxmax(), :].reset_index())['当月ARPU']\n",
    "min1 = (data_1.loc[data_1.groupby('语音通话整体满意度')['当月ARPU'].idxmin(), :].reset_index())['当月ARPU']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df1 = pd.concat([score,min1,max1], axis = 1)\n",
    "df1.columns = ['语音通话整体满意度','当月ARPU_min','当月ARPU_max']\n",
    " \n",
    "max2 = (data_1.loc[data_1.groupby('网络覆盖与信号强度')['当月ARPU'].idxmax(), :].reset_index())['当月ARPU']\n",
    "min2 = (data_1.loc[data_1.groupby('网络覆盖与信号强度')['当月ARPU'].idxmin(), :].reset_index())['当月ARPU']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df2 = pd.concat([score,min2,max2], axis = 1)\n",
    "df2.columns = ['网络覆盖与信号强度','当月ARPU_min','当月ARPU_max']\n",
    " \n",
    "max3 = (data_1.loc[data_1.groupby('语音通话清晰度')['当月ARPU'].idxmax(), :].reset_index())['当月ARPU']\n",
    "min3 = (data_1.loc[data_1.groupby('语音通话清晰度')['当月ARPU'].idxmin(), :].reset_index())['当月ARPU']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df3 = pd.concat([score,min3,max3], axis = 1)\n",
    "df3.columns = ['语音通话清晰度','当月ARPU_min','当月ARPU_max']\n",
    " \n",
    "max4 = (data_1.loc[data_1.groupby('语音通话稳定性')['当月ARPU'].idxmax(), :].reset_index())['当月ARPU']\n",
    "min4 = (data_1.loc[data_1.groupby('语音通话稳定性')['当月ARPU'].idxmin(), :].reset_index())['当月ARPU']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df4 = pd.concat([score,min4,max4], axis = 1)\n",
    "df4.columns = ['语音通话稳定性','当月ARPU_min','当月ARPU_max']\n",
    " \n",
    "newdf = pd.concat([df1['语音通话整体满意度'], df1['当月ARPU_max'], df2['当月ARPU_max'], df3['当月ARPU_max'], df4['当月ARPU_max']], axis=1)\n",
    "newdf.columns = ['打分', '语音通话整体满意度', '网络覆盖与信号强度', '语音通话清晰度', '语音通话稳定性']\n",
    "newdf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
